from BasicLibraries import *
from Functions.Miscellaneous import utils as ut
from statsmodels.discrete.discrete_model import Probit
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)


'''
This is the script to run all the regressions
'''


def filter_outlier(X, var_to_filter, quantile=0.98):
    """Drop the rows whose ``var_to_filter`` value lies in the upper tail.

    A row is kept only when its value is strictly below the requested
    quantile of the column.  Rows with a missing value are dropped too,
    because a comparison against NaN is false.

    The caller's frame is not modified: rows are selected with a boolean
    mask instead of first overwriting the column with NaN.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to filter.
    var_to_filter : hashable
        Label of the column the cut-off is computed on.
    quantile : float, default 0.98
        Quantile of ``X[var_to_filter]`` used as the (exclusive) upper bound.

    Returns
    -------
    pandas.DataFrame
        The retained rows, re-indexed from 0.
    """
    column = X[var_to_filter]
    keep = column < column.quantile(quantile)
    return X.loc[keep].reset_index(drop=True)
def make_effort_BRD_ass(dt, dummies, years_FE=True):
    """Regress each response dimension on the number of initiatives.

    For every target column a robust linear model (``RLM`` with a ``HuberT``
    norm; presumably the statsmodels classes exposed by the star import) is
    fitted on the ``_rw`` controls, ``Mills_Ratio`` and the ``dummies``
    columns.  No constant is added.  Coefficients are standardised with
    ``std(x) / std(y)``, rounded to two decimals and suffixed with the marker
    returned by ``ut.utils().significance``.

    Only the ``total_effort_rw`` row (shown as 'Number of initiatives') is
    kept in the final table, followed by hard-coded 'Yes' rows.  The table is
    printed as LaTeX and returned.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain the four target columns, the ``_rw`` controls,
        ``Mills_Ratio``, ``total_effort``, ``gvkey`` and every label in
        ``dummies``.
    dummies : list
        Labels of the dummy columns added to the regressors.
    years_FE : bool, default True
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        One column per target, indexed by the row labels described above.
    """
    targets = ['diversification', 'Risk mitigation',
               'Stakeholders engagement', 'Innovation']
    # Every control except the Mills ratio enters in its ``_rw`` version.
    controls = [name + '_rw' for name in ('total_effort',
                                          'Profitability',
                                          'MTB',
                                          'Tangibility',
                                          'firm_size',
                                          'Leverage',
                                          'DirectControl_lag')] + ['Mills_Ratio']

    Z = dt.copy()
    # Loop-invariant, so done once.  The regressors use ``total_effort_rw``;
    # this transformed column only reaches the model if ``dummies`` lists it.
    Z['total_effort'] = Z['total_effort'].apply(np.log)

    L = pd.DataFrame()
    for target_var in targets:
        M = Z[[target_var] + controls + dummies + ['gvkey']].dropna()
        X = M[controls + dummies]
        Y = M[target_var]

        lm = RLM(Y, X, M=HuberT()).fit(cov='H1')
        # Standardised coefficients: beta * std(x) / std(y).
        params = lm.params.loc[controls] * X[controls].std() / Y.std()
        cells = [str(np.round(params.loc[name], 2))
                 + ut.utils().significance(lm.pvalues.loc[name])
                 for name in controls]
        # A flat index: wrapping the labels in another list would make pandas
        # build a one-level MultiIndex.
        res = pd.DataFrame(cells, index=controls,
                           columns=[target_var.capitalize()])
        L = pd.concat((L, res), axis=1)

    adjust_names = {'firm_size_rw': 'Size',
                    'DirectControl_lag_rw': 'GHG emissions (lag)',
                    'Profitability_rw': 'Profitability',
                    'MTB_rw': 'Market-to-Book',
                    'competition': 'Competition',
                    'Tangibility_rw': 'Tangibility',
                    'Leverage_rw': 'Leverage',
                    'total_effort_rw': 'Number of initiatives'}
    L = L.rename(index=adjust_names)
    L = L.drop(index=['Mills_Ratio'])

    # Descriptive footer rows; their content is fixed to 'Yes'.
    footer_rows = ['Asset controls', 'Country fixed-effects',
                   'Year fixed-effects', 'Sector fixed-effects',
                   'Source of emissions', 'Self-selectivity']
    for label in footer_rows:
        L.loc[label, :] = ['Yes'] * len(L.columns)

    L = L.loc[['Number of initiatives'] + footer_rows]
    L = L.rename(columns={"Diversification": 'Response diversity'})

    # '\\midrule' is spelled with an escaped backslash: '\m' is not a valid
    # escape sequence in a normal string literal.
    latex = L.to_latex(escape=False)
    latex = latex.replace('Asset ', '\\midrule \nAsset ')
    latex = latex.replace('Number of initiatives ',
                          r'{\bf Number of initiatives} ')
    latex = latex.replace('Profitability', '\\midrule \nProfitability')
    print(latex)

    return L


def make_impact_emission_regression(dt, dummies, emission_var, years_FE=True, standardise_coefficients=True, all_targets=True):
    """Robust regressions of log emissions on each response dimension.

    For each target a Huber-weighted ``RLM`` of ``log(emission_var)`` is
    fitted on a constant, the target, the controls and the ``dummies``
    columns.  ``total_effort`` is logged and ``DirectControl_lag`` is passed
    through ``scale`` before the sample is built.

    Parameters
    ----------
    dt : pandas.DataFrame
        Source data; it is copied for every target and never modified.
    dummies : list
        Labels of the dummy columns added to the regressors.
    emission_var : hashable
        Label of the dependent variable (logged after missing rows are
        dropped).
    years_FE : bool, default True
        Not used by this function.
    standardise_coefficients : bool, default True
        When true, coefficients and confidence bounds are multiplied by
        ``std(x) / std(y)``.
    all_targets : bool, default True
        When false only 'diversification' is estimated.

    Returns
    -------
    tuple
        ``(tab, tabNUM, fg, M, models, baseline)``: the formatted one-row
        table, the numeric (coefficient, lower, upper) rows, a frame holding
        one ``0`` column per target, the estimation sample of the last
        target, the fitted models, and ``[coefficient, p-value]`` pairs.
    """
    targets = ['diversification']
    if all_targets:
        targets = ['diversification', 'Risk mitigation',
                   'Stakeholders engagement', 'Innovation']

    controls = ['competition',
                'total_effort',
                'Tangibility',
                'firm_size',
                'Leverage',
                'MTB',
                'DirectControl_lag']

    index_name = ''
    tab = pd.DataFrame()
    tabNUM = pd.DataFrame()
    fg = pd.DataFrame()
    models = []
    baseline = []
    for target_var in targets:
        regressors = [target_var] + controls

        M = dt.copy()
        M['total_effort'] = M['total_effort'].apply(np.log)
        M['DirectControl_lag'] = scale(M['DirectControl_lag'])
        print('Number of observations:', len(M))

        # Infinite values are treated as missing before the sample is cut.
        M = M.replace([np.inf, -np.inf], [np.nan, np.nan])
        M = M[[emission_var] + regressors + dummies + ['gvkey']].dropna()
        # NOTE(review): the log is taken after the clean-up above, so a
        # non-positive emission value would reach the model as -inf/NaN;
        # confirm the inputs are strictly positive.
        M[emission_var] = M[emission_var].apply(np.log)
        print('Number of observations:', len(M))

        X = tools.add_constant(M[regressors + dummies])
        Y = M[emission_var]

        lm = RLM(Y, X, M=HuberT()).fit(cov='H1')
        models.append(lm)

        interval = lm.conf_int()
        params = lm.params.loc[regressors]
        LB = interval[0].loc[regressors]
        UB = interval[1].loc[regressors]
        if standardise_coefficients:
            x_std = X[regressors].std()
            y_std = Y.std()
            params = params * x_std / y_std
            LB = LB * x_std / y_std
            UB = UB * x_std / y_std

        # Column 0 holds the coefficients, column 1 the rounded p-values.
        summary = pd.concat((params, lm.pvalues.loc[regressors].round(3)),
                            axis=1)

        # fg gains one single-cell column (value 0) per target.
        fg = pd.concat((fg, pd.DataFrame([0])), axis=1)

        cell = (str(np.round(summary.loc[target_var][0], 2))
                + ut.utils().significance(summary.loc[target_var][1]))
        res = pd.DataFrame([cell], index=[index_name],
                           columns=[target_var.capitalize()])
        resNUM = pd.DataFrame([params.loc[target_var],
                               LB.loc[target_var],
                               UB.loc[target_var]]).transpose()
        tab = pd.concat((tab, res), axis=1)
        tabNUM = pd.concat((tabNUM, resNUM))
        baseline.append([params[target_var], lm.pvalues.loc[target_var]])
    return tab, tabNUM, fg, M, models, baseline


#====================================================================================================
#====================================================================================================


def make_EAR_regression(dt, dummies, emission_var, impact_type='EAR', years_FE=False,dimension_='all'):
    """Probit regressions of ``emission_var`` on each response dimension.

    Country dummies (labels in ``dummies`` that equal a value of the ``loc``
    column) are replaced by ``MacroRegion`` dummies, and only dummies whose
    column sum in the estimation sample exceeds 10 are kept.  The target and
    the controls are passed through ``scale`` before a ``Probit`` is fitted;
    the reported numbers come from ``get_margeff()`` with its defaults.

    Parameters
    ----------
    dt : pandas.DataFrame
        Source data; it is copied and never modified.  Must contain ``loc``,
        ``MacroRegion``, ``gvkey``, the controls and the targets.
    dummies : iterable
        Labels of the candidate dummy columns.
    emission_var : hashable
        Label of the binary dependent variable.
    impact_type : str, default 'EAR'
        Not used by this function.
    years_FE : bool, default False
        Not used by this function.
    dimension_ : {'all', 'cdp'}, default 'all'
        'all' estimates the four targets and logs ``total_effort``; 'cdp'
        estimates 'diversification' only and leaves ``total_effort`` as is.

    Returns
    -------
    tuple
        ``(tab, tabNUM, M)``: the formatted one-row table, the numeric
        (effect, lower, upper) rows, and the estimation sample of the last
        target.

    Raises
    ------
    ValueError
        If ``dimension_`` is neither 'all' nor 'cdp'.
    """
    if dimension_ == 'all':
        targets = ['diversification', 'Risk mitigation',
                   'Stakeholders engagement', 'Innovation']
    elif dimension_ == 'cdp':
        targets = ['diversification']
    else:
        # Fail early with a clear message instead of an unbound ``targets``.
        raise ValueError(
            "dimension_ must be 'all' or 'cdp', got {!r}".format(dimension_))

    controls = ['competition',
                'Tangibility',
                'firm_size',
                'Leverage',
                'MTB',
                'volatility',
                'ret_usd',
                'DirectControl_lag',
                'total_effort']
    index_name = ''
    tabNUM = pd.DataFrame()
    tab = pd.DataFrame()

    # ---- Preparation that does not depend on the target: done once. ----
    base = dt.copy()
    countries = base['loc'].unique()
    macro_d = pd.get_dummies(base['MacroRegion'], drop_first=True).astype(int)
    dummies_tmp = list(dummies) + list(macro_d)
    base = pd.concat((base, macro_d), axis=1)
    # Drop the country dummies; the macro-region dummies take their place.
    for country in countries:
        if country in dummies_tmp:
            dummies_tmp.remove(country)

    if dimension_ == 'all':
        base['total_effort'] = base['total_effort'].apply(np.log)
    base['DirectControl_lag'] = scale(base['DirectControl_lag'])

    for target_var in targets:
        print('Number of observations:', len(dt))
        regressors = [target_var] + controls

        M = base[[emission_var] + regressors + dummies_tmp + ['gvkey']].dropna()
        dummy_counts = M[dummies_tmp].sum()
        dummies_to_keep = list(dummy_counts[dummy_counts > 10].dropna().index)

        # Explicit copy: the scaled values are written into X, which must not
        # be a view/slice of M.
        X = M[regressors + dummies_to_keep].copy()
        X[regressors] = scale(X[regressors])
        Y = M[emission_var]

        lm = Probit(Y, X).fit(disp=0)

        print('Marginal effects')
        me = lm.get_margeff()
        frame = me.summary_frame()
        params_ = frame['dy/dx']
        pvalues = frame['Pr(>|z|)']
        bounds = pd.DataFrame(me.conf_int(), index=params_.index)
        LB = bounds[0].loc[regressors]
        UB = bounds[1].loc[regressors]

        # Look the values up by label.  Indexing the row with ``[0]``/``[1]``
        # would rely on a positional fallback, because the two series are
        # named 'dy/dx' and 'Pr(>|z|)'.
        effect = params_.loc[target_var]
        pvalue = np.round(pvalues.loc[target_var], 3)

        resNUM = pd.DataFrame([effect,
                               LB.loc[target_var],
                               UB.loc[target_var]]).transpose()
        res = pd.DataFrame(
            [str(np.round(effect, 2)) + ut.utils().significance(pvalue)],
            index=[index_name], columns=[target_var.capitalize()])
        tabNUM = pd.concat((tabNUM, resNUM))
        tab = pd.concat((tab, res), axis=1)

    return tab, tabNUM, M




def r2(M, x, Y):
    """Return the squared correlation between ``M.predict(x)`` and ``Y``.

    ``M`` is any object exposing ``predict``; the correlation is the first
    element of the ``pearsonr`` result.
    """
    fitted = M.predict(x)
    correlation = pearsonr(fitted, Y)[0]
    return correlation ** 2

def make_driver_regression(dt, dummies, driver, years_FE=True):
    """Robust regressions of each response dimension on a single driver.

    For every target a Huber-weighted ``RLM`` is fitted on the driver, a
    driver-specific control set and the dummy columns (no constant is added).
    Coefficients and confidence bounds are standardised with
    ``std(x) / std(y)``; only the driver's row is reported.

    Parameters
    ----------
    dt : pandas.DataFrame
        Source data; it is copied and never modified.  Must contain
        ``DirectControl_lag_rw``, ``total_effort``, ``gvkey``, the targets
        and the columns required by the chosen driver.
    dummies : list
        Labels of the dummy columns.  Ignored for the policy drivers
        ('DomainDiversity', 'InstrTypeDiversity', 'EnvPolicies'), which use a
        fixed sector / source / year list without country dummies.
    driver : str
        One of 'competition', 'DirectControl_lag', 'firm_size', 'inv_int',
        'Tangibility', 'MTB', 'Profitability', 'total_effort', 'Leverage',
        'volatility', 'DomainDiversity', 'InstrTypeDiversity', 'EnvPolicies'.
    years_FE : bool, default True
        Not used by this function.

    Returns
    -------
    tuple
        ``(tab, tabNUM, models, M, exp_power)``: the formatted one-row table,
        the numeric (coefficient, lower, upper) rows, the fitted models, the
        estimation sample of the last target, and a frame of
        ``[target, r2]`` rows.

    Raises
    ------
    ValueError
        If ``driver`` has no control set defined here.
    """
    var1, var2 = 'DomainDiversity', 'InstrTypeDiversity'
    policy_drivers = [var1, var2, 'EnvPolicies']

    targets = ['diversification', 'Risk mitigation',
               'Stakeholders engagement', 'Innovation']

    # Controls that follow the driver, in estimation order.  The suffixing
    # step below is positional: it relies on each list ending with
    # ['total_effort', 'Mills_Ratio'] (or just 'Mills_Ratio' for the
    # 'total_effort' driver).
    other_controls = {
        'competition': ['Tangibility', 'firm_size', 'MTB',
                        'total_effort', 'Mills_Ratio'],
        'DirectControl_lag': ['inv_int', 'Tangibility', 'firm_size',
                              'total_effort', 'Mills_Ratio'],
        'firm_size': ['at_lagged', 'total_effort', 'Mills_Ratio'],
        # NOTE(review): this list does not end with 'total_effort', so the
        # positional suffixing drops 'MTB' and yields both 'total_effort_rw'
        # and 'total_effort'.  Kept as in the original specification;
        # confirm whether that is intended.
        'inv_int': ['total_effort', 'firm_size', 'Profitability',
                    'Tangibility', 'MTB', 'Mills_Ratio'],
        'Tangibility': ['at_usd', 'total_effort', 'Mills_Ratio'],
        'MTB': ['at_usd', 'Profitability', 'total_effort', 'Mills_Ratio'],
        'Profitability': ['firm_size', 'total_effort', 'Mills_Ratio'],
        'total_effort': ['Profitability', 'MTB', 'Tangibility', 'firm_size',
                         'Leverage', 'Mills_Ratio'],
        'Leverage': ['Profitability', 'MTB', 'Tangibility', 'firm_size',
                     'total_effort', 'Mills_Ratio'],
        'volatility': ['Profitability', 'MTB', 'Tangibility', 'ret_usd',
                       'firm_size', 'total_effort', 'Mills_Ratio'],
    }
    policy_controls = ['Profitability', 'inv_int', 'Tangibility', 'firm_size',
                       'Leverage', 'MTB', 'volatility', 'ret_usd',
                       'DirectControl_lag', 'total_effort', 'Mills_Ratio']
    for name in policy_drivers:
        other_controls[name] = policy_controls

    if driver not in other_controls:
        # Includes 'envPolDIFF': no control set is defined for it here.
        raise ValueError(
            'no control set defined for driver {!r}; expected one of {}'.format(
                driver, sorted(other_controls)))
    controls = [driver] + other_controls[driver]

    # Apply the ``_rw`` suffix.  Policy drivers keep their own name, so the
    # coefficient can later be looked up under ``driver`` itself.
    if driver in policy_drivers:
        controls = ([driver] + [T + '_rw' for T in controls[1:-2]]
                    + ['total_effort', 'Mills_Ratio'])
        # The two diversity measures control for each other.
        if driver == var1:
            controls.append(var2)
        elif driver == var2:
            controls.append(var1)
        driver_ = driver
    elif driver == 'total_effort':
        controls = [T + '_rw' for T in controls[:-1]] + ['Mills_Ratio']
        driver_ = driver + '_rw'
    else:
        controls = ([T + '_rw' for T in controls[:-2]]
                    + ['total_effort', 'Mills_Ratio'])
        driver_ = driver + '_rw'

    if driver in policy_drivers:
        #=== Remove country fixed effects as they are accounted for in the policy variables
        alternative_dummies = (['Industrial', 'Material', 'Utilities',
                                'Estimated', 'Mixed']
                               + list(range(2011, 2022)))
    else:
        alternative_dummies = list(dummies)

    # Target-independent transformations are applied once.
    Z = dt.copy()
    Z['DirectControl_lag_rw'] = scale(Z['DirectControl_lag_rw'])
    Z['total_effort'] = Z['total_effort'].apply(np.log)

    index_name = ''
    tab = pd.DataFrame()
    tabNUM = pd.DataFrame()
    models = []
    exp_power = []
    for target_var in targets:
        M = Z[[target_var] + controls + alternative_dummies + ['gvkey']].dropna()
        X = M[controls + alternative_dummies]
        Y = M[target_var]
        print('Number of observations:', len(X))

        lm = RLM(Y, X, M=HuberT()).fit(cov='H1')
        interval = lm.conf_int()
        x_std = X[controls].std()
        y_std = Y.std()
        params = lm.params.loc[controls] * x_std / y_std
        LB = interval[0].loc[controls] * x_std / y_std
        UB = interval[1].loc[controls] * x_std / y_std
        models.append(lm)
        exp_power.append([target_var, r2(lm, X, Y)])

        # Column 0 holds the coefficients, column 1 the rounded p-values.
        res = pd.concat((params, lm.pvalues.loc[controls].round(3)), axis=1)
        res = pd.DataFrame(
            [str(np.round(res.loc[driver_][0], 2))
             + ut.utils().significance(res.loc[driver_][1])],
            index=[index_name], columns=[target_var.capitalize()])
        resNUM = pd.DataFrame([params.loc[driver_],
                               LB.loc[driver_],
                               UB.loc[driver_]]).transpose()

        tab = pd.concat((tab, res), axis=1)
        tabNUM = pd.concat((tabNUM, resNUM))
    return tab, tabNUM, models, M, pd.DataFrame(exp_power)


#====================================================================================================
#====================================================================================================
#====================================================================================================
def make_effort_regression(dt, dummies, emission_var, years_FE=False):
    """Robust regressions of log emissions on each response dimension.

    For each target a Huber-weighted ``RLM`` of ``log(emission_var)`` is
    fitted on a constant, the target, the controls (including
    ``Mills_Ratio``) and the ``dummies`` columns.  Rows whose target equals
    ``-inf`` are excluded.  The target's coefficient and confidence bounds
    are standardised with ``std(x) / std(y)``.

    Parameters
    ----------
    dt : pandas.DataFrame
        Source data; it is copied for every target and never modified.
    dummies : list
        Labels of the dummy columns added to the regressors.
    emission_var : hashable
        Label of the dependent variable (logged after missing rows are
        dropped).
    years_FE : bool, default False
        Not used by this function.

    Returns
    -------
    tuple
        ``(tab, tabNUM, 0, M, models, baseline)``: the formatted one-row
        table, the numeric (coefficient, lower, upper) rows, a literal ``0``
        placeholder, the estimation sample of the last target, the fitted
        models, and ``[coefficient, p-value]`` pairs.
    """
    targets = ['diversification', 'Risk mitigation',
               'Stakeholders engagement', 'Innovation']
    controls = ['competition',
                'Profitability',
                'Tangibility',
                'inv_int',
                'firm_size',
                'Leverage',
                'DirectControl_lag',
                'Mills_Ratio']

    index_name = ''
    tab = pd.DataFrame()
    tabNUM = pd.DataFrame()
    models = []
    baseline = []
    for target_var in targets:
        needed = [emission_var, target_var] + controls + dummies + ['gvkey']
        M = dt.copy()
        M = M[needed].dropna()
        M[emission_var] = M[emission_var].apply(np.log)
        # Discard observations whose target is -inf.
        finite_target = M[target_var] > -np.inf
        M = M[finite_target]

        X = tools.add_constant(M[[target_var] + controls + dummies])
        Y = M[emission_var]

        lm = RLM(Y, X, M=HuberT()).fit(cov='H1')
        models.append(lm)

        interval = lm.conf_int()
        x_std = X[[target_var]].std()
        y_std = Y.std()
        params = lm.params.loc[[target_var]] * x_std / y_std
        LB = interval[0].loc[[target_var]] * x_std / y_std
        UB = interval[1].loc[[target_var]] * x_std / y_std

        # Column 0 holds the coefficient, column 1 the rounded p-value.
        summary = pd.concat((params, lm.pvalues.loc[[target_var]].round(3)),
                            axis=1)

        cell = (str(np.round(summary.loc[target_var][0], 2))
                + ut.utils().significance(summary.loc[target_var][1]))
        res = pd.DataFrame([cell], index=[index_name],
                           columns=[target_var.capitalize()])
        resNUM = pd.DataFrame([params.loc[target_var],
                               LB.loc[target_var],
                               UB.loc[target_var]]).transpose()
        tab = pd.concat((tab, res), axis=1)
        tabNUM = pd.concat((tabNUM, resNUM))
        baseline.append([params[target_var], lm.pvalues.loc[target_var]])
    return tab, tabNUM, 0, M, models, baseline
#====================================================================================================
#====================================================================================================
#====================================================================================================


